###############################################################################
###### Rabia Malik & Svanhildur Thorvaldsdottir
###### Are Goodwill Ambassadors Good for Business?
###### Generating Dataset for Analysis
###############################################################################

library(foreign)
library(dplyr)

# Load the raw Qualtrics export (the file as downloaded, before any
# cleaning). Every variable created below is derived from this data frame.

## insert your own file path here
ga <- read.csv(file = "~/Original_Dataset.csv", header = TRUE)


###########################
##
## Creating all the outcome variables
##
###########################

###########
# givetoUNICEF_dum creation
###########

# Original question: "If you were giving money to charity, would you give
# to UNICEF?"  1 = Yes; 2 = No, would give to another charity (which one is
# recorded in a separate column); 3 = No, I don't give to charity.

# There are 8 treatment groups, so the answers are spread across the 8
# columns T1.3 ... T8.3 (one per group). Summing across those columns while
# ignoring NAs collapses them into the single column givetoUNICEF.
#
# rowSums() handles all respondents at once, so no row-by-row loop is
# needed and a zero-row data frame is handled without error. unname()
# drops the row names that rowSums() attaches to its result.
#
# NOTE(review): a respondent with NA in all eight columns gets 0 here
# (sum with na.rm = TRUE), not NA, and therefore 0 in the dummy below.
# Confirm that coding non-response as 0 is intended.
ga$givetoUNICEF <- unname(rowSums(ga[paste0("T", 1:8, ".3")], na.rm = TRUE))
table(ga$givetoUNICEF)

# Dummy used in the analysis: 1 if the respondent answered "Yes" (1),
# 0 for every other value of givetoUNICEF.
ga$givetoUNICEF_dum <- ifelse(ga$givetoUNICEF == 1, 1, 0)
table(ga$givetoUNICEF_dum)
#


###########
# givetoCause_dum creation
###########

# Original question: "If you were giving money to charity, would you give
# to [the issue you saw]?"  1 = Yes; 2 = No, would give to another cause
# (which one is recorded in a separate column); 3 = No, I don't give to
# charity.

# Same layout as the UNICEF question: one column per treatment group,
# here T1.4 ... T8.4. They are collapsed into givetoCause by summing across
# the eight columns with NAs ignored, for all respondents at once
# (vectorised, so it also works on a zero-row data frame). unname() drops
# the row names that rowSums() attaches to its result.
#
# NOTE(review): a respondent with NA in all eight columns gets 0 here,
# not NA, and therefore 0 in the dummy below -- confirm this is intended.
ga$givetoCause <- unname(rowSums(ga[paste0("T", 1:8, ".4")], na.rm = TRUE))
table(ga$givetoCause)

# Dummy used in the final analysis: 1 if the respondent answered "Yes" (1),
# 0 for every other value of givetoCause.
ga$givetoCause_dum <- ifelse(ga$givetoCause == 1, 1, 0)
table(ga$givetoCause_dum)


###########
# amountToUNICEF creation
###########

# Dollar-allocation item (out of $100), stored per treatment group in the
# columns T1.5_2 ... T8.5_2.
#
# NOTE(review): the earlier note on this item described it as the "amount
# you'd keep for self", which does not match the variable name
# amountToUNICEF. Confirm against the survey copy which quantity the
# "_2" slider records.

# The eight columns are collapsed into amountToUNICEF by summing across
# them with NAs ignored, for all respondents at once (vectorised, so it
# also works on a zero-row data frame). unname() drops the row names that
# rowSums() attaches to its result.
#
# NOTE(review): a respondent with NA in all eight columns gets an amount
# of 0 rather than NA -- confirm that non-response should count as $0.
ga$amountToUNICEF <- unname(
  rowSums(ga[paste0("T", 1:8, ".5_2")], na.rm = TRUE)
)

summary(ga$amountToUNICEF)

###########
# learnAboutUNICEF_dum creation
###########

# Whether the respondent would like to learn more about UNICEF and its
# causes: 1 = yes, about UNICEF's work on the issue; 2 = yes, about UNICEF
# in general; 3 = no.

# Same layout as the previous outcome variables: one column per treatment
# group, here T1.6 ... T8.6. They are collapsed into learnAboutUNICEF by
# summing across the eight columns with NAs ignored, for all respondents
# at once (vectorised, so it also works on a zero-row data frame).
# unname() drops the row names that rowSums() attaches to its result.
#
# NOTE(review): a respondent with NA in all eight columns gets 0 here,
# not NA, and therefore 0 in the dummy below -- confirm this is intended.
ga$learnAboutUNICEF <- unname(
  rowSums(ga[paste0("T", 1:8, ".6")], na.rm = TRUE)
)
table(ga$learnAboutUNICEF)

# Dummy used in the final analysis: answers 1 and 2 (either kind of "yes")
# are coded 1; every other value, including 3 ("no"), is coded 0.
ga$learnAboutUNICEF_dum <- ifelse(
  ga$learnAboutUNICEF == 1 | ga$learnAboutUNICEF == 2, 1, 0
)



###########################
##
## Creating the treatment variables
##
###########################

# There are 8 treatment groups; the relevant survey copy shows which
# group is which (e.g., T1 is Shakira / girls' education / real world).
# Label scheme: S/E = Shakira/Expert, GE/DC = girls' education/displaced
# children, Real/Hyp = the two framing conditions.

# A respondent is assigned to treatment X when column TX.2 holds a value,
# because that question is only shown to respondents in treatment X.
#
# NOTE(review): "holds a value" is tested with !is.na(). If the TX.2
# columns are free text, read.csv() reads blank cells as "" rather than
# NA unless na.strings is set -- confirm the columns really are NA for
# respondents outside the treatment.

# Marker column -> treatment label, listed in the order T1 ... T8.
treat_labels <- c(
  T1.2 = "S_GE_Real",
  T2.2 = "S_GE_Hyp",
  T3.2 = "E_GE_Real",
  T4.2 = "E_GE_Hyp",
  T5.2 = "S_DC_Real",
  T6.2 = "S_DC_Hyp",
  T7.2 = "E_DC_Real",
  T8.2 = "E_DC_Hyp"
)

# Fail early with a clear message if the export lacks a marker column.
stopifnot(all(names(treat_labels) %in% names(ga)))

# Assign labels column by column for all respondents at once. Columns are
# processed in T1 ... T8 order, so if a respondent has values in more than
# one marker column the highest-numbered treatment wins. Respondents with
# no marker value keep Treat = NA.
ga$Treat <- NA
for (marker_col in names(treat_labels)) {
  ga$Treat[!is.na(ga[[marker_col]])] <- treat_labels[[marker_col]]
}
table(ga$Treat)
#

# The eight-level Treat variable is the basis for the coarser treatment
# variables used in the paper's analyses; they are built below.

# RvsH is "Hyp" for the four *_Hyp arms and "Real" otherwise; Real_dum is
# its 0/1 version (1 = "Real"). A missing Treat stays NA in both.
hyp_arms <- c("S_GE_Hyp", "E_GE_Hyp", "E_DC_Hyp", "S_DC_Hyp")
in_hyp_arm <- Reduce(`|`, lapply(hyp_arms, function(arm) ga$Treat == arm))

ga$RvsH <- ifelse(in_hyp_arm, "Hyp", "Real")
table(ga$RvsH)
ga$Real_dum <- ifelse(ga$RvsH == "Real", 1, 0)
table(ga$Real_dum)


# Messenger-by-issue dummies. Each one is 1 when Treat equals either the
# Hyp or the Real arm of the given messenger/issue pair, 0 for any other
# arm, and NA when Treat is missing.
in_either_arm <- function(first_arm, second_arm) {
  ifelse(ga$Treat == first_arm | ga$Treat == second_arm, 1, 0)
}

# Shakira_GE: respondent read about Shakira supporting girls' education
ga$Shakira_GE <- in_either_arm("S_GE_Hyp", "S_GE_Real")
table(ga$Shakira_GE)


# Shakira_DC: respondent read about Shakira supporting displaced children
ga$Shakira_DC <- in_either_arm("S_DC_Hyp", "S_DC_Real")
table(ga$Shakira_DC)

# Expert_GE: respondent read about the Expert supporting girls' education
ga$Expert_GE <- in_either_arm("E_GE_Hyp", "E_GE_Real")
table(ga$Expert_GE)

# Expert_DC: respondent read about the Expert supporting displaced children
ga$Expert_DC <- in_either_arm("E_DC_Hyp", "E_DC_Real")
table(ga$Expert_DC)

# GirlsEduc: 1 if the respondent read about girls' education,
# 0 if they read about displaced children
ga$GirlsEduc <- ifelse(ga$Shakira_GE == 1 | ga$Expert_GE == 1, 1, 0)
table(ga$GirlsEduc)

# Shakira: 1 if the respondent was in a Shakira treatment condition,
# 0 if they were in an Expert condition
ga$Shakira <- ifelse(ga$Shakira_GE == 1 | ga$Shakira_DC == 1, 1, 0)
table(ga$Shakira)

# Finally, Treat_4: the four messenger-by-issue groups (E_DC, E_GE, S_DC,
# S_GE), i.e. Treat with the Real/Hyp distinction dropped. It is the
# grouping variable for the ANOVA balance tests.
#
# The mapping is a lookup from each eight-level label to its group. A
# missing or unrecognised Treat yields a real NA, never the character
# string "NA", which table() and ANOVA would treat as a fifth group.
treat_4_lookup <- c(
  E_DC_Hyp = "E_DC", E_DC_Real = "E_DC",
  E_GE_Hyp = "E_GE", E_GE_Real = "E_GE",
  S_DC_Hyp = "S_DC", S_DC_Real = "S_DC",
  S_GE_Hyp = "S_GE", S_GE_Real = "S_GE"
)
# as.character() keeps the lookup by name even when Treat is entirely NA.
ga$Treat_4 <- unname(treat_4_lookup[as.character(ga$Treat)])
table(ga$Treat_4)







###########################
##
## Creating demographic/pre-treatment variables
##
###########################

# Gender dummies from D3
# D3 codes: 1=male (618), 2=female (496), 3=other (5), 4=PNS/DK (2)
table(ga$D3)

gender_code <- ga$D3
ga$male <- ifelse(gender_code == 1, 1, 0)
table(ga$male)
ga$female <- ifelse(gender_code == 2, 1, 0)
table(ga$female)

# Age is D4, copied over unchanged
ga$age <- ga$D4
summary(ga$age)

# Hispanic dummy: 1 when D5 equals 1, 0 for any other answer
hispanic_code <- ga$D5
ga$His_dum <- ifelse(hispanic_code == 1, 1, 0)
table(ga$His_dum)

# Income comes from D9, which is measured in bands with 1 the lowest
table(ga$D9)
# income_50kLess: dummy for low income (i.e., less than 50k), which
# corresponds to the two lowest bands of D9 (codes 1 and 2)
income_band <- ga$D9
ga$income_50kLess <- ifelse(income_band == 1 | income_band == 2, 1, 0)
table(ga$income_50kLess)



# Race dummies (white, black, asian) from D6
# D6 codes: 1 is white, 2 is Black, 3 is Native Am, 4 is Asian, 5 is Other
race_code <- ga$D6

table(ga$D6)
ga$white <- ifelse(race_code == 1, 1, 0)
summary(ga$white)

table(ga$D6)
ga$black <- ifelse(race_code == 2, 1, 0)
summary(ga$black)

table(ga$D6)
ga$asian <- ifelse(race_code == 4, 1, 0)
summary(ga$asian)


# Three party-ID dummies built from the 7-point item D7:
#   Repub = codes 1-2, Indep = codes 3-5, Dem = codes 6-7
table(ga$D7)
party_code <- ga$D7
ga$Repub <- ifelse(party_code == 1 | party_code == 2, 1, 0)
ga$Indep <- ifelse(party_code == 3 | party_code == 4 | party_code == 5, 1, 0)
ga$Dem <- ifelse(party_code == 6 | party_code == 7, 1, 0)
table(ga$Repub)
table(ga$Indep)
table(ga$Dem)

# college_deg: dummy equal to 1 when D8 is 4 or 5
table(ga$D8)
education_code <- ga$D8
ga$college_deg <- ifelse(education_code == 4 | education_code == 5, 1, 0)
table(ga$college_deg)

# child: dummy equal to 1 when D11 is 1

table(ga$D11)
child_code <- ga$D11
ga$child <- ifelse(child_code == 1, 1, 0)
table(ga$child)

# news_check_num: how often the respondent checks the news, from D12
# 0: low (D12 = 4, 5 or 6), 1: medium (D12 = 2 or 3), 2: high (D12 = 1)
# Any other value of D12, including NA, is left as NA.
# The recode is a positional lookup: element k is the level for D12 == k.
news_level_by_code <- c(2, 1, 1, 0, 0, 0)
ga$news_check_num <- news_level_by_code[match(ga$D12, 1:6)]
table(ga$news_check_num)


# SC1 is the number of correctly identified countries and SC3 the number
# of correctly identified organizations; both are used unchanged in the
# balance tests later, so they are only tabulated here
table(ga$SC1)
table(ga$SC3)

# incorrect: dummy equal to 1 for respondents who were in a Shakira
# treatment but reported at the end that they did NOT read about a
# celebrity in the study (F1 == 2)
in_shakira_arm <- ga$Shakira == 1
denied_celebrity <- ga$F1 == 2
ga$incorrect <- ifelse(in_shakira_arm & denied_celebrity, 1, 0)
table(ga$incorrect)


# Both dummies below are read off the Shakira rating item F2_4.
shakira_rating <- ga$F2_4

# Dislike_Shak: 1 if the respondent disliked Shakira (rating below 6)
ga$Dislike_Shak <- ifelse(shakira_rating < 6, 1, 0)
table(ga$Dislike_Shak)

# DK_Shakira: 1 for those who do not know who Shakira is (code 12)
ga$DK_Shakira <- ifelse(shakira_rating == 12, 1, 0)
table(ga$DK_Shakira)

# neverDonate_Cause and neverDonate_UNICEF: dummies equal to 1 for
# respondents who gave answer 3 ("No, I don't give to charity") on the
# cause question and on the UNICEF question, respectively
cause_answer <- ga$givetoCause
ga$neverDonate_Cause <- ifelse(cause_answer == 3, 1, 0)
table(ga$neverDonate_Cause)
unicef_answer <- ga$givetoUNICEF
ga$neverDonate_UNICEF <- ifelse(unicef_answer == 3, 1, 0)
table(ga$neverDonate_UNICEF)



#### Keep only the columns actually used in the analyses; ga2 is the
# data frame that gets saved as the replication dataset
ga2 <- ga[, c(
  # outcomes
  "amountToUNICEF", "givetoUNICEF_dum", "givetoCause_dum",
  "learnAboutUNICEF_dum",
  # demographics, party ID and treatment indicators
  # (column order is kept exactly as listed)
  "male", "age", "His_dum", "white",
  "Dem", "Indep", "Repub",
  "college_deg", "income_50kLess",
  "Shakira", "GirlsEduc", "Real_dum",
  "Shakira_GE", "Shakira_DC", "female",
  "Expert_GE", "Expert_DC", "Treat_4",
  "child", "news_check_num", "SC1", "SC3",
  # manipulation-check and attitude flags
  "incorrect", "DK_Shakira", "Dislike_Shak",
  "neverDonate_Cause", "neverDonate_UNICEF"
), drop = FALSE]








############################
## Creating re-weighted dataset for
## Hispanic hypothesis robustness check
############################

# Keep only the variables needed for the robustness check
Hispanic_data <- select(ga, amountToUNICEF, givetoUNICEF_dum,
                        givetoCause_dum, learnAboutUNICEF_dum,
                        Shakira, His_dum, male,
                        age, white, Dem, Indep, college_deg,
                        income_50kLess)

# Per-respondent weight: 1.715 for Hispanic respondents (His_dum == 1)
# and 0.9136 for non-Hispanic respondents (His_dum == 0). The weight is
# computed once, from the unweighted His_dum, before any column is scaled.
#
# A respondent whose His_dum is NA has no defined weight, so their weight
# is NA and every value in their row becomes NA. (Testing His_dum row by
# row inside if() would instead stop with "missing value where TRUE/FALSE
# needed" on the first such respondent.)
hispanic_weight <- ifelse(
  Hispanic_data$His_dum == 1, 1.715,
  ifelse(Hispanic_data$His_dum == 0, 0.9136, NA_real_)
)

# Multiply every selected column -- including His_dum itself -- by the
# respondent's weight. Working column-wise on whole vectors avoids the
# row-by-column double loop and is safe for a zero-row data frame.
# `Hispanic_data[] <-` keeps the object a data frame with the same names.
Hispanic_data[] <- lapply(
  Hispanic_data,
  function(column) column * hispanic_weight
)

# save this as a .csv file
# add your own file path
#write.csv(ga2, "~/GA_ReplicationData.csv")
#write.csv(Hispanic_data, "~/GA_HispanicData.csv")

###############################################################################
############################   END   ##########################################
###############################################################################
